import pandas as pd
import numpy as np
import seaborn as sns
sns.set(color_codes=True)
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.metrics import confusion_matrix
# Load the two raw inputs: the per-flight records and the survey answers.
flight_data, survey_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Flight_data.csv', 'Survey_data.csv')
)
Unit of observation
flight_data.shape # (rows, columns): 90917 observations and 9 variables
survey_data.shape # (rows, columns): 90917 observations and 16 variables
# Both datasets have the same number of rows but hold different variables.
(90917, 16)
flight_data.info()
# CustomerID, Age, Flight_Distance and DepartureDelayin_Mins are stored as int64
# and ArrivalDelayin_Mins as float64; the other four columns are object (text).
# CustomerType, TypeTravel and ArrivalDelayin_Mins contain missing values.
<class 'pandas.core.frame.DataFrame'> RangeIndex: 90917 entries, 0 to 90916 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 90917 non-null int64 1 Gender 90917 non-null object 2 CustomerType 81818 non-null object 3 Age 90917 non-null int64 4 TypeTravel 81829 non-null object 5 Class 90917 non-null object 6 Flight_Distance 90917 non-null int64 7 DepartureDelayin_Mins 90917 non-null int64 8 ArrivalDelayin_Mins 90633 non-null float64 dtypes: float64(1), int64(4), object(4) memory usage: 6.2+ MB
survey_data.info()
# CustomerId is the only integer column; the other 15 columns are object (text labels).
# Departure.Arrival.time_convenient, Food_drink and Onboard_service contain missing values.
<class 'pandas.core.frame.DataFrame'> RangeIndex: 90917 entries, 0 to 90916 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerId 90917 non-null int64 1 Satisfaction 90917 non-null object 2 Seat_comfort 90917 non-null object 3 Departure.Arrival.time_convenient 82673 non-null object 4 Food_drink 82736 non-null object 5 Gate_location 90917 non-null object 6 Inflightwifi_service 90917 non-null object 7 Inflight_entertainment 90917 non-null object 8 Online_support 90917 non-null object 9 Ease_of_Onlinebooking 90917 non-null object 10 Onboard_service 83738 non-null object 11 Leg_room_service 90917 non-null object 12 Baggage_handling 90917 non-null object 13 Checkin_service 90917 non-null object 14 Cleanliness 90917 non-null object 15 Online_boarding 90917 non-null object dtypes: int64(1), object(15) memory usage: 11.1+ MB
# Preview the first rows of the survey answers.
survey_data.head()
| CustomerID | Satisfaction | Seat_comfort | Departure.Arrival.time_convenient | Food_drink | Gate_location | Inflightwifi_service | Inflight_entertainment | Online_support | Ease_of_Onlinebooking | Onboard_service | Leg_room_service | Baggage_handling | Checkin_service | Cleanliness | Online_boarding | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 149965 | satisfied | extremely poor | extremely poor | extremely poor | need improvement | need improvement | good | need improvement | acceptable | acceptable | extremely poor | acceptable | excellent | acceptable | need improvement |
| 1 | 149966 | satisfied | extremely poor | extremely poor | extremely poor | manageable | need improvement | extremely poor | need improvement | need improvement | NaN | acceptable | good | good | good | need improvement |
| 2 | 149967 | satisfied | extremely poor | NaN | extremely poor | manageable | acceptable | good | acceptable | poor | poor | extremely poor | poor | good | poor | acceptable |
| 3 | 149968 | satisfied | extremely poor | extremely poor | extremely poor | manageable | good | acceptable | good | need improvement | need improvement | extremely poor | need improvement | good | need improvement | excellent |
| 4 | 149969 | satisfied | extremely poor | extremely poor | extremely poor | manageable | need improvement | extremely poor | need improvement | need improvement | excellent | good | excellent | excellent | good | need improvement |
# Preview the first rows of the flight records.
flight_data.head()
| CustomerID | Gender | CustomerType | Age | TypeTravel | Class | Flight_Distance | DepartureDelayin_Mins | ArrivalDelayin_Mins | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 149965 | Female | Loyal Customer | 65 | Personal Travel | Eco | 265 | 0 | 0.0 |
| 1 | 149966 | Female | Loyal Customer | 15 | Personal Travel | Eco | 2138 | 0 | 0.0 |
| 2 | 149967 | Female | Loyal Customer | 60 | Personal Travel | Eco | 623 | 0 | 0.0 |
| 3 | 149968 | Female | Loyal Customer | 70 | Personal Travel | Eco | 354 | 0 | 0.0 |
| 4 | 149969 | Male | Loyal Customer | 30 | NaN | Eco | 1894 | 0 | 0.0 |
# List the flight_data column names (the key column is spelled 'CustomerID' here).
flight_data.columns
Index(['CustomerID', 'Gender', 'CustomerType', 'Age', 'TypeTravel', 'Class',
'Flight_Distance', 'DepartureDelayin_Mins', 'ArrivalDelayin_Mins'],
dtype='object')
# The survey file spells the key 'CustomerId'; rename it to 'CustomerID' so both
# frames share the same key column name and can be merged on it.
key_rename = {'CustomerId': 'CustomerID'}
survey_data = survey_data.rename(columns=key_rename)
survey_data.columns  # check that the column name changed
Index(['CustomerID', 'Satisfaction', 'Seat_comfort',
'Departure.Arrival.time_convenient', 'Food_drink', 'Gate_location',
'Inflightwifi_service', 'Inflight_entertainment', 'Online_support',
'Ease_of_Onlinebooking', 'Onboard_service', 'Leg_room_service',
'Baggage_handling', 'Checkin_service', 'Cleanliness',
'Online_boarding'],
dtype='object')
# Merge the flight records and the survey answers into one dataset.
# The join key is named explicitly instead of relying on pandas picking every
# shared column name, and validate="one_to_one" makes pandas raise a MergeError
# if a CustomerID is duplicated on either side rather than silently
# multiplying rows.
airline_data = pd.merge(
    flight_data,
    survey_data,
    on="CustomerID",
    how="inner",
    validate="one_to_one",
)
airline_data.shape
# the merged data has 90917 observations and 24 variables
(90917, 24)
# Column names of the merged data: the flight columns followed by the survey columns.
airline_data.columns
Index(['CustomerID', 'Gender', 'CustomerType', 'Age', 'TypeTravel', 'Class',
'Flight_Distance', 'DepartureDelayin_Mins', 'ArrivalDelayin_Mins',
'Satisfaction', 'Seat_comfort', 'Departure.Arrival.time_convenient',
'Food_drink', 'Gate_location', 'Inflightwifi_service',
'Inflight_entertainment', 'Online_support', 'Ease_of_Onlinebooking',
'Onboard_service', 'Leg_room_service', 'Baggage_handling',
'Checkin_service', 'Cleanliness', 'Online_boarding'],
dtype='object')
# Dtypes and non-null counts of the merged data.
airline_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 90917 entries, 0 to 90916 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 90917 non-null int64 1 Gender 90917 non-null object 2 CustomerType 81818 non-null object 3 Age 90917 non-null int64 4 TypeTravel 81829 non-null object 5 Class 90917 non-null object 6 Flight_Distance 90917 non-null int64 7 DepartureDelayin_Mins 90917 non-null int64 8 ArrivalDelayin_Mins 90633 non-null float64 9 Satisfaction 90917 non-null object 10 Seat_comfort 90917 non-null object 11 Departure.Arrival.time_convenient 82673 non-null object 12 Food_drink 82736 non-null object 13 Gate_location 90917 non-null object 14 Inflightwifi_service 90917 non-null object 15 Inflight_entertainment 90917 non-null object 16 Online_support 90917 non-null object 17 Ease_of_Onlinebooking 90917 non-null object 18 Onboard_service 83738 non-null object 19 Leg_room_service 90917 non-null object 20 Baggage_handling 90917 non-null object 21 Checkin_service 90917 non-null object 22 Cleanliness 90917 non-null object 23 Online_boarding 90917 non-null object dtypes: float64(1), int64(4), object(19) memory usage: 17.3+ MB
# Changing from object to category.
# Every non-numeric column (plus the CustomerID key) is cast to the pandas
# "category" dtype; Age, Flight_Distance and the two delay columns stay numeric.
# NOTE(review): CustomerID is unique per row, so as a category it gets one level
# per customer - confirm that casting the key is intended.
categorical_columns = [
    "CustomerID",
    "Gender",
    "CustomerType",
    "TypeTravel",
    "Class",
    "Satisfaction",
    "Seat_comfort",
    "Gate_location",
    "Food_drink",
    "Inflightwifi_service",
    "Inflight_entertainment",
    "Online_support",
    "Ease_of_Onlinebooking",
    "Onboard_service",
    "Leg_room_service",
    "Baggage_handling",
    "Checkin_service",
    "Cleanliness",
    "Online_boarding",
    "Departure.Arrival.time_convenient",
]
for column in categorical_columns:
    airline_data[column] = airline_data[column].astype("category")
# Confirm the dtype change: the cast columns should now be reported as category.
airline_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 90917 entries, 0 to 90916 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 90917 non-null category 1 Gender 90917 non-null category 2 CustomerType 81818 non-null category 3 Age 90917 non-null int64 4 TypeTravel 81829 non-null category 5 Class 90917 non-null category 6 Flight_Distance 90917 non-null int64 7 DepartureDelayin_Mins 90917 non-null int64 8 ArrivalDelayin_Mins 90633 non-null float64 9 Satisfaction 90917 non-null category 10 Seat_comfort 90917 non-null category 11 Departure.Arrival.time_convenient 82673 non-null category 12 Food_drink 82736 non-null category 13 Gate_location 90917 non-null category 14 Inflightwifi_service 90917 non-null category 15 Inflight_entertainment 90917 non-null category 16 Online_support 90917 non-null category 17 Ease_of_Onlinebooking 90917 non-null category 18 Onboard_service 83738 non-null category 19 Leg_room_service 90917 non-null category 20 Baggage_handling 90917 non-null category 21 Checkin_service 90917 non-null category 22 Cleanliness 90917 non-null category 23 Online_boarding 90917 non-null category dtypes: category(20), float64(1), int64(3) memory usage: 8.7 MB
# Preview the merged data after the dtype conversion.
airline_data.head()
| CustomerID | Gender | CustomerType | Age | TypeTravel | Class | Flight_Distance | DepartureDelayin_Mins | ArrivalDelayin_Mins | Satisfaction | ... | Inflightwifi_service | Inflight_entertainment | Online_support | Ease_of_Onlinebooking | Onboard_service | Leg_room_service | Baggage_handling | Checkin_service | Cleanliness | Online_boarding | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 149965 | Female | Loyal Customer | 65 | Personal Travel | 1 | 265 | 0 | 0.0 | satisfied | ... | need improvement | good | need improvement | acceptable | acceptable | extremely poor | acceptable | excellent | acceptable | need improvement |
| 1 | 149966 | Female | Loyal Customer | 15 | Personal Travel | 1 | 2138 | 0 | 0.0 | satisfied | ... | need improvement | extremely poor | need improvement | need improvement | NaN | acceptable | good | good | good | need improvement |
| 2 | 149967 | Female | Loyal Customer | 60 | Personal Travel | 1 | 623 | 0 | 0.0 | satisfied | ... | acceptable | good | acceptable | poor | poor | extremely poor | poor | good | poor | acceptable |
| 3 | 149968 | Female | Loyal Customer | 70 | Personal Travel | 1 | 354 | 0 | 0.0 | satisfied | ... | good | acceptable | good | need improvement | need improvement | extremely poor | need improvement | good | need improvement | excellent |
| 4 | 149969 | Male | Loyal Customer | 30 | NaN | 1 | 1894 | 0 | 0.0 | satisfied | ... | need improvement | extremely poor | need improvement | need improvement | excellent | good | excellent | excellent | good | need improvement |
5 rows × 24 columns
# looking at value counts for non-numeric features
num_to_display = 10  # defined up here so it is easy to change later
for colname in airline_data.select_dtypes(include="category").columns:
    # dropna=False keeps the NaN count in the listing
    val_counts = airline_data[colname].value_counts(dropna=False)
    # head() takes the first rows by position, whatever the index labels are
    print(val_counts.head(num_to_display))
    if len(val_counts) > num_to_display:
        print(f'Only displaying first {num_to_display} of {len(val_counts)} values.')
    print('\n\n')  # extra space between columns
240881 1 180240 1 180264 1 180265 1 180266 1 180267 1 180268 1 180269 1 180270 1 180271 1 Name: CustomerID, dtype: int64 Only displaying first 10 of 90917 values. Female 46186 Male 44731 Name: Gender, dtype: int64 Loyal Customer 66897 disloyal Customer 14921 NaN 9099 Name: CustomerType, dtype: int64 Business travel 56481 Personal Travel 25348 NaN 9088 Name: TypeTravel, dtype: int64 Business 43535 Eco 40758 Eco Plus 6624 Name: Class, dtype: int64 satisfied 49761 neutral or dissatisfied 41156 Name: Satisfaction, dtype: int64 acceptable 20552 need improvement 20002 good 19789 poor 14687 excellent 12519 extremely poor 3368 Name: Seat_comfort, dtype: int64 good 18840 excellent 17079 acceptable 14806 need improvement 14539 poor 13210 NaN 8244 extremely poor 4199 Name: Departure.Arrival.time_convenient, dtype: int64 acceptable 17991 need improvement 17359 good 17245 poor 13400 excellent 12947 NaN 8181 extremely poor 3794 Name: Food_drink, dtype: int64 manageable 23385 Convinient 21088 need improvement 17113 Inconvinient 15876 very convinient 13454 very inconvinient 1 Name: Gate_location, dtype: int64 good 22159 excellent 20258 acceptable 19199 need improvement 18894 poor 10311 extremely poor 96 Name: Inflightwifi_service, dtype: int64 good 29373 excellent 20786 acceptable 16995 need improvement 13527 poor 8198 extremely poor 2038 Name: Inflight_entertainment, dtype: int64 good 29042 excellent 24916 acceptable 15090 need improvement 12063 poor 9805 extremely poor 1 Name: Online_support, dtype: int64 good 27993 excellent 23960 acceptable 15686 need improvement 13896 poor 9370 extremely poor 12 Name: Ease_of_Onlinebooking, dtype: int64 good 26373 excellent 20396 acceptable 17411 need improvement 11018 poor 8537 NaN 7179 extremely poor 3 Name: Onboard_service, dtype: int64 good 27814 excellent 24071 acceptable 15775 need improvement 15156 poor 7779 extremely poor 322 Name: Leg_room_service, dtype: int64 good 33822 excellent 25002 acceptable 17233 need improvement 9301 poor 
5559 Name: Baggage_handling, dtype: int64 good 25483 acceptable 24941 excellent 18918 need improvement 10813 poor 10761 extremely poor 1 Name: Checkin_service, dtype: int64 good 34246 excellent 25079 acceptable 16930 need improvement 9283 poor 5375 extremely poor 4 Name: Cleanliness, dtype: int64 good 24676 acceptable 21427 excellent 20993 need improvement 13035 poor 10777 extremely poor 9 Name: Online_boarding, dtype: int64
Relevel (integer-encode) the categorical variables that have no missing values, to reduce the number of dummy-variable columns.
# Class: swap the cabin-class labels for integer codes.
# LabelEncoder numbers the labels in sorted order
# (0=Business, 1=Eco, 2=Eco Plus).
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
class_labels = airline_data['Class'].astype('str')
airline_data['Class'] = number.fit_transform(class_labels)
airline_data['Class'].unique()  # codes present after encoding
array([1, 0, 2])
# Seat_comfort: swap the rating labels for integer codes.
# LabelEncoder numbers the labels in sorted order (0=acceptable, 1=excellent,
# 2=extremely poor, 3=good, 4=need improvement, 5=poor), so the codes do not
# rank the ratings from worst to best.
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
seat_comfort_labels = airline_data['Seat_comfort'].astype('str')
airline_data['Seat_comfort'] = number.fit_transform(seat_comfort_labels)
airline_data['Seat_comfort'].unique()  # codes present after encoding
array([2, 5, 3, 1, 4, 0])
# Gate_location: swap the convenience labels for integer codes.
# LabelEncoder numbers the labels in sorted order (0=Convinient, 1=Inconvinient,
# 2=manageable, 3=need improvement, 4=very convinient, 5=very inconvinient),
# so the codes do not rank the answers from worst to best.
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
gate_location_labels = airline_data['Gate_location'].astype('str')
airline_data['Gate_location'] = number.fit_transform(gate_location_labels)
airline_data['Gate_location'].unique()  # codes present after encoding
array([3, 2, 0, 1, 4, 5])
# Inflightwifi_service: swap the rating labels for integer codes.
# LabelEncoder numbers the labels in sorted order (0=acceptable, 1=excellent,
# 2=extremely poor, 3=good, 4=need improvement, 5=poor), so the codes do not
# rank the ratings from worst to best.
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
wifi_labels = airline_data['Inflightwifi_service'].astype('str')
airline_data['Inflightwifi_service'] = number.fit_transform(wifi_labels)
airline_data['Inflightwifi_service'].unique()  # codes present after encoding
array([4, 0, 3, 1, 5, 2])
# Inflight_entertainment: swap the rating labels for integer codes.
# LabelEncoder numbers the labels in sorted order (0=acceptable, 1=excellent,
# 2=extremely poor, 3=good, 4=need improvement, 5=poor), so the codes do not
# rank the ratings from worst to best.
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
entertainment_labels = airline_data['Inflight_entertainment'].astype('str')
airline_data['Inflight_entertainment'] = number.fit_transform(entertainment_labels)
airline_data['Inflight_entertainment'].unique()  # codes present after encoding
array([3, 2, 0, 1, 4, 5])
# Online_support: swap the rating labels for integer codes.
# LabelEncoder numbers the labels in sorted order (0=acceptable, 1=excellent,
# 2=extremely poor, 3=good, 4=need improvement, 5=poor), so the codes do not
# rank the ratings from worst to best.
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
online_support_labels = airline_data['Online_support'].astype('str')
airline_data['Online_support'] = number.fit_transform(online_support_labels)
airline_data['Online_support'].unique()  # codes present after encoding
array([4, 0, 3, 1, 5, 2])
# Ease_of_Onlinebooking: swap the rating labels for integer codes.
# LabelEncoder numbers the labels in sorted order (0=acceptable, 1=excellent,
# 2=extremely poor, 3=good, 4=need improvement, 5=poor), so the codes do not
# rank the ratings from worst to best.
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
online_booking_labels = airline_data['Ease_of_Onlinebooking'].astype('str')
airline_data['Ease_of_Onlinebooking'] = number.fit_transform(online_booking_labels)
airline_data['Ease_of_Onlinebooking'].unique()  # codes present after encoding
array([0, 4, 5, 1, 3, 2])
# Leg_room_service: swap the rating labels for integer codes.
# LabelEncoder numbers the labels in sorted order (0=acceptable, 1=excellent,
# 2=extremely poor, 3=good, 4=need improvement, 5=poor), so the codes do not
# rank the ratings from worst to best.
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
leg_room_labels = airline_data['Leg_room_service'].astype('str')
airline_data['Leg_room_service'] = number.fit_transform(leg_room_labels)
airline_data['Leg_room_service'].unique()  # codes present after encoding
array([2, 0, 3, 4, 5, 1])
# Baggage_handling: swap the rating labels for integer codes.
# This column has only five labels (no 'extremely poor'), so the sorted-order
# codes are 0=acceptable, 1=excellent, 2=good, 3=need improvement, 4=poor.
# The codes therefore differ from the six-label rating columns.
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
baggage_labels = airline_data['Baggage_handling'].astype('str')
airline_data['Baggage_handling'] = number.fit_transform(baggage_labels)
airline_data['Baggage_handling'].unique()  # codes present after encoding
array([0, 2, 4, 3, 1])
# Checkin_service: swap the rating labels for integer codes.
# LabelEncoder numbers the labels in sorted order (0=acceptable, 1=excellent,
# 2=extremely poor, 3=good, 4=need improvement, 5=poor), so the codes do not
# rank the ratings from worst to best.
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
checkin_labels = airline_data['Checkin_service'].astype('str')
airline_data['Checkin_service'] = number.fit_transform(checkin_labels)
airline_data['Checkin_service'].unique()  # codes present after encoding
array([1, 3, 0, 4, 5, 2])
# Cleanliness: swap the rating labels for integer codes.
# LabelEncoder numbers the labels in sorted order (0=acceptable, 1=excellent,
# 2=extremely poor, 3=good, 4=need improvement, 5=poor), so the codes do not
# rank the ratings from worst to best.
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
cleanliness_labels = airline_data['Cleanliness'].astype('str')
airline_data['Cleanliness'] = number.fit_transform(cleanliness_labels)
airline_data['Cleanliness'].unique()  # codes present after encoding
array([0, 3, 5, 4, 1, 2])
# Online_boarding: swap the rating labels for integer codes.
# LabelEncoder numbers the labels in sorted order (0=acceptable, 1=excellent,
# 2=extremely poor, 3=good, 4=need improvement, 5=poor), so the codes do not
# rank the ratings from worst to best.
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
online_boarding_labels = airline_data['Online_boarding'].astype('str')
airline_data['Online_boarding'] = number.fit_transform(online_boarding_labels)
airline_data['Online_boarding'].unique()  # codes present after encoding
array([4, 0, 1, 5, 3, 2])
# Gender: swap the two labels for integer codes.
# LabelEncoder numbers the labels in sorted order (0=Female, 1=Male).
from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
gender_labels = airline_data['Gender'].astype('str')
airline_data['Gender'] = number.fit_transform(gender_labels)
airline_data['Gender'].unique()  # codes present after encoding
array([0, 1])
# Preview the data after label encoding.
airline_data.head()
| CustomerID | Gender | CustomerType | Age | TypeTravel | Class | Flight_Distance | DepartureDelayin_Mins | ArrivalDelayin_Mins | Satisfaction | ... | Inflightwifi_service | Inflight_entertainment | Online_support | Ease_of_Onlinebooking | Onboard_service | Leg_room_service | Baggage_handling | Checkin_service | Cleanliness | Online_boarding | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 149965 | 0 | Loyal Customer | 65 | Personal Travel | 1 | 265 | 0 | 0.0 | satisfied | ... | 4 | 3 | 4 | 0 | acceptable | 2 | 0 | 1 | 0 | 4 |
| 1 | 149966 | 0 | Loyal Customer | 15 | Personal Travel | 1 | 2138 | 0 | 0.0 | satisfied | ... | 4 | 2 | 4 | 4 | NaN | 0 | 2 | 3 | 3 | 4 |
| 2 | 149967 | 0 | Loyal Customer | 60 | Personal Travel | 1 | 623 | 0 | 0.0 | satisfied | ... | 0 | 3 | 0 | 5 | poor | 2 | 4 | 3 | 5 | 0 |
| 3 | 149968 | 0 | Loyal Customer | 70 | Personal Travel | 1 | 354 | 0 | 0.0 | satisfied | ... | 3 | 0 | 3 | 4 | need improvement | 2 | 3 | 3 | 4 | 1 |
| 4 | 149969 | 1 | Loyal Customer | 30 | NaN | 1 | 1894 | 0 | 0.0 | satisfied | ... | 4 | 2 | 4 | 4 | excellent | 3 | 1 | 1 | 3 | 4 |
5 rows × 24 columns
# Check the dtypes after label encoding: the encoded columns are now integers.
airline_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 90917 entries, 0 to 90916 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 90917 non-null category 1 Gender 90917 non-null int32 2 CustomerType 81818 non-null category 3 Age 90917 non-null int64 4 TypeTravel 81829 non-null category 5 Class 90917 non-null int32 6 Flight_Distance 90917 non-null int64 7 DepartureDelayin_Mins 90917 non-null int64 8 ArrivalDelayin_Mins 90633 non-null float64 9 Satisfaction 90917 non-null category 10 Seat_comfort 90917 non-null int32 11 Departure.Arrival.time_convenient 82673 non-null category 12 Food_drink 82736 non-null category 13 Gate_location 90917 non-null int32 14 Inflightwifi_service 90917 non-null int32 15 Inflight_entertainment 90917 non-null int32 16 Online_support 90917 non-null int32 17 Ease_of_Onlinebooking 90917 non-null int32 18 Onboard_service 83738 non-null category 19 Leg_room_service 90917 non-null int32 20 Baggage_handling 90917 non-null int32 21 Checkin_service 90917 non-null int32 22 Cleanliness 90917 non-null int32 23 Online_boarding 90917 non-null int32 dtypes: category(7), float64(1), int32(13), int64(3) memory usage: 12.0 MB
# Cast the categorical columns to the pandas "category" dtype again.
# The label-encoded columns are plain integers at this point, so this turns
# them back into categoricals (with integer levels); columns that are already
# category, such as the ones with missing values, are left unchanged.
# NOTE(review): CustomerID is unique per row, so as a category it gets one level
# per customer - confirm that casting the key is intended.
categorical_columns = [
    "CustomerID",
    "Gender",
    "CustomerType",
    "TypeTravel",
    "Class",
    "Satisfaction",
    "Seat_comfort",
    "Gate_location",
    "Food_drink",
    "Inflightwifi_service",
    "Inflight_entertainment",
    "Online_support",
    "Ease_of_Onlinebooking",
    "Onboard_service",
    "Leg_room_service",
    "Baggage_handling",
    "Checkin_service",
    "Cleanliness",
    "Online_boarding",
    "Departure.Arrival.time_convenient",
]
for column in categorical_columns:
    airline_data[column] = airline_data[column].astype("category")
# looking at value counts for releveled categories
num_to_display = 10  # defined up here so it is easy to change later
for colname in airline_data.select_dtypes(include="category").columns:
    # dropna=False keeps the NaN count in the listing
    val_counts = airline_data[colname].value_counts(dropna=False)
    # head() takes the first rows by position; the encoded columns have integer
    # labels, so this avoids any label-versus-position ambiguity
    print(val_counts.head(num_to_display))
    if len(val_counts) > num_to_display:
        print(f'Only displaying first {num_to_display} of {len(val_counts)} values.')
    print('\n\n')  # extra space between columns
240881 1 180240 1 180264 1 180265 1 180266 1 180267 1 180268 1 180269 1 180270 1 180271 1 Name: CustomerID, dtype: int64 Only displaying first 10 of 90917 values. 0 46186 1 44731 Name: Gender, dtype: int64 Loyal Customer 66897 disloyal Customer 14921 NaN 9099 Name: CustomerType, dtype: int64 Business travel 56481 Personal Travel 25348 NaN 9088 Name: TypeTravel, dtype: int64 0 43535 1 40758 2 6624 Name: Class, dtype: int64 satisfied 49761 neutral or dissatisfied 41156 Name: Satisfaction, dtype: int64 0 20552 4 20002 3 19789 5 14687 1 12519 2 3368 Name: Seat_comfort, dtype: int64 good 18840 excellent 17079 acceptable 14806 need improvement 14539 poor 13210 NaN 8244 extremely poor 4199 Name: Departure.Arrival.time_convenient, dtype: int64 acceptable 17991 need improvement 17359 good 17245 poor 13400 excellent 12947 NaN 8181 extremely poor 3794 Name: Food_drink, dtype: int64 2 23385 0 21088 3 17113 1 15876 4 13454 5 1 Name: Gate_location, dtype: int64 3 22159 1 20258 0 19199 4 18894 5 10311 2 96 Name: Inflightwifi_service, dtype: int64 3 29373 1 20786 0 16995 4 13527 5 8198 2 2038 Name: Inflight_entertainment, dtype: int64 3 29042 1 24916 0 15090 4 12063 5 9805 2 1 Name: Online_support, dtype: int64 3 27993 1 23960 0 15686 4 13896 5 9370 2 12 Name: Ease_of_Onlinebooking, dtype: int64 good 26373 excellent 20396 acceptable 17411 need improvement 11018 poor 8537 NaN 7179 extremely poor 3 Name: Onboard_service, dtype: int64 3 27814 1 24071 0 15775 4 15156 5 7779 2 322 Name: Leg_room_service, dtype: int64 2 33822 1 25002 0 17233 3 9301 4 5559 Name: Baggage_handling, dtype: int64 3 25483 0 24941 1 18918 4 10813 5 10761 2 1 Name: Checkin_service, dtype: int64 3 34246 1 25079 0 16930 4 9283 5 5375 2 4 Name: Cleanliness, dtype: int64 3 24676 0 21427 1 20993 4 13035 5 10777 2 9 Name: Online_boarding, dtype: int64
# Check the dtypes once more after the re-cast to category.
# NOTE(review): the recorded output below still lists the encoded columns as
# int32, which does not match the cast above - it looks stale; re-run to confirm.
airline_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 90917 entries, 0 to 90916 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 90917 non-null category 1 Gender 90917 non-null int32 2 CustomerType 81818 non-null category 3 Age 90917 non-null int64 4 TypeTravel 81829 non-null category 5 Class 90917 non-null int32 6 Flight_Distance 90917 non-null int64 7 DepartureDelayin_Mins 90917 non-null int64 8 ArrivalDelayin_Mins 90633 non-null float64 9 Satisfaction 90917 non-null category 10 Seat_comfort 90917 non-null int32 11 Departure.Arrival.time_convenient 82673 non-null category 12 Food_drink 82736 non-null category 13 Gate_location 90917 non-null int32 14 Inflightwifi_service 90917 non-null int32 15 Inflight_entertainment 90917 non-null int32 16 Online_support 90917 non-null int32 17 Ease_of_Onlinebooking 90917 non-null int32 18 Onboard_service 83738 non-null category 19 Leg_room_service 90917 non-null int32 20 Baggage_handling 90917 non-null int32 21 Checkin_service 90917 non-null int32 22 Cleanliness 90917 non-null int32 23 Online_boarding 90917 non-null int32 dtypes: category(7), float64(1), int32(13), int64(3) memory usage: 12.0 MB
# Build an automated profiling report of the prepared dataset.
# Importing pandas_profiling is presumably what makes profile_report() available
# on DataFrames - TODO confirm against the installed package version.
import pandas_profiling
airline_data.profile_report()